{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "seed = 1337\n",
    "\n",
    "torch.manual_seed(seed)\n",
    "torch.cuda.manual_seed_all(seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MultilayerPerceptron(nn.Module):\n",
    "    def __init__(self, input_dim, hidden_dim, output_dim):\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            input_dim (int): the size of the input vectors\n",
    "            hidden_dim (int): the output size of the first Linear layer\n",
    "            output_dim (int): the output size of the second Linear layer\n",
    "        \"\"\"\n",
    "        super(MultilayerPerceptron, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_dim, hidden_dim)\n",
    "        self.fc2 = nn.Linear(hidden_dim, output_dim)\n",
    "\n",
    "    def forward(self, x_in, apply_softmax=False):\n",
    "        \"\"\"The forward pass of the MLP\n",
    "        \n",
    "        Args:\n",
    "            x_in (torch.Tensor): an input data tensor. \n",
    "                x_in.shape should be (batch, input_dim)\n",
    "            apply_softmax (bool): a flag for the softmax activation\n",
    "                should be false if used with the Cross Entropy losses\n",
    "        Returns:\n",
    "            the resulting tensor. tensor.shape should be (batch, output_dim)\n",
    "        \"\"\"\n",
    "        intermediate = F.relu(self.fc1(x_in))\n",
    "        output = self.fc2(intermediate)\n",
    "        \n",
    "        if apply_softmax:\n",
    "            output = F.softmax(output, dim=1)\n",
    "        return output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MultilayerPerceptron(\n",
      "  (fc1): Linear(in_features=3, out_features=100, bias=True)\n",
      "  (fc2): Linear(in_features=100, out_features=4, bias=True)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "batch_size = 2 # number of samples input at once\n",
    "input_dim = 3\n",
    "hidden_dim = 100\n",
    "output_dim = 4\n",
    "\n",
    "# Initialize model\n",
    "mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)\n",
    "print(mlp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe(x):\n",
    "    print(\"Type: {}\".format(x.type()))\n",
    "    print(\"Shape/size: {}\".format(x.shape))\n",
    "    print(\"Values: \\n{}\".format(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([2, 3])\n",
      "Values: \n",
      "tensor([[0.8329, 0.4277, 0.4363],\n",
      "        [0.9686, 0.6316, 0.8494]])\n"
     ]
    }
   ],
   "source": [
    "# Inputs\n",
    "x_input = torch.rand(batch_size, input_dim)\n",
    "describe(x_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([2, 4])\n",
      "Values: \n",
      "tensor([[-0.2456,  0.0723,  0.1589, -0.3294],\n",
      "        [-0.3497,  0.0828,  0.3391, -0.4271]], grad_fn=<AddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "y_output = mlp(x_input, apply_softmax=False)\n",
    "describe(y_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([2, 4])\n",
      "Values: \n",
      "tensor([[0.2087, 0.2868, 0.3127, 0.1919],\n",
      "        [0.1832, 0.2824, 0.3649, 0.1696]], grad_fn=<SoftmaxBackward>)\n"
     ]
    }
   ],
   "source": [
    "y_output = mlp(x_input, apply_softmax=True)\n",
    "describe(y_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-13"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MultilayerPerceptron(\n",
      "  (fc1): Linear(in_features=3, out_features=100, bias=True)\n",
      "  (fc2): Linear(in_features=100, out_features=4, bias=True)\n",
      ")\n",
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([2, 4])\n",
      "Values: \n",
      "tensor([[ 0.0193,  0.0275,  0.2319,  0.3032],\n",
      "        [-0.5323,  0.3183,  0.4194, -0.0205]], grad_fn=<AddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "class MultilayerPerceptron(nn.Module):\n",
    "    def __init__(self, input_dim, hidden_dim, output_dim):\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            input_dim (int): the size of the input vectors\n",
    "            hidden_dim (int): the output size of the first Linear layer\n",
    "            output_dim (int): the output size of the second Linear layer\n",
    "        \"\"\"\n",
    "        super(MultilayerPerceptron, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_dim, hidden_dim)\n",
    "        self.fc2 = nn.Linear(hidden_dim, output_dim)\n",
    "\n",
    "    def forward(self, x_in, apply_softmax=False):\n",
    "        \"\"\"The forward pass of the MLP\n",
    "        \n",
    "        Args:\n",
    "            x_in (torch.Tensor): an input data tensor. \n",
    "                x_in.shape should be (batch, input_dim)\n",
    "            apply_softmax (bool): a flag for the softmax activation\n",
    "                should be false if used with the Cross Entropy losses\n",
    "        Returns:\n",
    "            the resulting tensor. tensor.shape should be (batch, output_dim)\n",
    "        \"\"\"\n",
    "        intermediate = F.relu(self.fc1(x_in))\n",
    "        output = self.fc2(F.dropout(intermediate, p=0.5))\n",
    "        \n",
    "        if apply_softmax:\n",
    "            output = F.softmax(output, dim=1)\n",
    "        return output\n",
    "\n",
    "batch_size = 2 # number of samples input at once\n",
    "input_dim = 3\n",
    "hidden_dim = 100\n",
    "output_dim = 4\n",
    "\n",
    "# Initialize model\n",
    "mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)\n",
    "print(mlp)\n",
    "\n",
    "y_output = mlp(x_input, apply_softmax=False)\n",
    "describe(y_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-14"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 10, 7])\n",
      "torch.Size([2, 16, 5])\n"
     ]
    }
   ],
   "source": [
    "batch_size = 2\n",
    "one_hot_size = 10\n",
    "sequence_width = 7\n",
    "data = torch.randn(batch_size, one_hot_size, sequence_width)\n",
    "conv1 = nn.Conv1d(in_channels=one_hot_size, out_channels=16, kernel_size=3)\n",
    "intermediate1 = conv1(data)\n",
    "print(data.size())\n",
    "print(intermediate1.size())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 32, 3])\n",
      "torch.Size([2, 64, 1])\n"
     ]
    }
   ],
   "source": [
    "conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3)\n",
    "conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)\n",
    "\n",
    "intermediate2 = conv2(intermediate1)\n",
    "intermediate3 = conv3(intermediate2)\n",
    "\n",
    "print(intermediate2.size())\n",
    "print(intermediate3.size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 64])\n"
     ]
    }
   ],
   "source": [
    "y_output = intermediate3.squeeze()\n",
    "print(y_output.size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor(-0.0493, grad_fn=<SumBackward0>)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "intermediate2.mean(dim=0).mean(dim=1).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 80])\n",
      "torch.Size([2, 16])\n"
     ]
    }
   ],
   "source": [
    "# Method 2 of reducing to feature vectors\n",
    "print(intermediate1.view(batch_size, -1).size())\n",
    "\n",
    "# Method 3 of reducing to feature vectors\n",
    "print(torch.mean(intermediate1, dim=2).size())\n",
    "# print(torch.max(intermediate1, dim=2).size())\n",
    "# print(torch.sum(intermediate1, dim=2).size())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4-22\n",
    "\n",
    "The full model will not be reproduced here. Instead, we will just show batch norm being used.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 16, 5])\n",
      "torch.Size([2, 32, 3])\n",
      "torch.Size([2, 64, 1])\n"
     ]
    }
   ],
   "source": [
    "conv1 = nn.Conv1d(in_channels=one_hot_size, out_channels=16, kernel_size=3)\n",
    "conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3)\n",
    "conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)\n",
    "\n",
    "conv1_bn = nn.BatchNorm1d(num_features=16)\n",
    "conv2_bn = nn.BatchNorm1d(num_features=32)\n",
    "    \n",
    "intermediate1 = conv1_bn(F.relu(conv1(data)))\n",
    "intermediate2 = conv2_bn(F.relu(conv2(intermediate1)))\n",
    "intermediate3 = conv3(intermediate2)\n",
    "\n",
    "print(intermediate1.size())\n",
    "print(intermediate2.size())\n",
    "print(intermediate3.size())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: BatchNorm computes its statistics over the batch and sequence dimensions. In other words, the input to each batchnorm1d is a tensor of size `(B, C, L)` (where b=batch, c=channels, and l=length). Each `(B, L)` slice should have 0-mean.  This reduces covariate shift. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([-2.9802e-08,  1.2418e-09,  0.0000e+00, -1.9868e-08, -9.9341e-09,\n",
       "         9.9341e-09,  2.4835e-09, -9.9341e-09, -1.2418e-09, -1.9868e-08,\n",
       "         1.8626e-09,  0.0000e+00, -1.2418e-09,  0.0000e+00,  1.9868e-08,\n",
       "         9.3132e-10, -4.9671e-09, -9.9341e-09,  2.9802e-08,  9.9341e-09,\n",
       "        -9.7013e-11,  9.9341e-09,  1.3970e-09,  0.0000e+00,  9.9341e-09,\n",
       "         9.9341e-09,  1.9868e-08, -4.9671e-09, -1.2418e-09,  4.4703e-08,\n",
       "         4.9671e-09, -5.9605e-08], grad_fn=<MeanBackward0>)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "intermediate2.mean(dim=(0, 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## Bonus Examples\n",
    "\n",
    "In chapter 4, we cover convolutions. Below are code examples which instantiate the convolutions with various hyper parameter settings. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([1, 2, 3, 3])\n",
      "Values: \n",
      "tensor([[[[-1.3831,  0.5164,  0.2551],\n",
      "          [-0.4873,  1.1319,  1.4091],\n",
      "          [-1.0097,  1.3822,  2.5432]],\n",
      "\n",
      "         [[-0.5137, -0.1112, -0.7682],\n",
      "          [ 1.0231, -1.3065,  0.2210],\n",
      "          [-0.3294,  0.6213,  2.1973]]]])\n",
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([1, 2, 2, 2])\n",
      "Values: \n",
      "Parameter containing:\n",
      "tensor([[[[-0.0395, -0.2742],\n",
      "          [-0.1382,  0.0440]],\n",
      "\n",
      "         [[-0.2400,  0.1503],\n",
      "          [ 0.1100, -0.1167]]]], requires_grad=True)\n",
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([1, 1, 2, 2])\n",
      "Values: \n",
      "tensor([[[[ 0.5734, -0.2716],\n",
      "          [-0.4697, -0.1801]]]], grad_fn=<MkldnnConvolutionBackward>)\n"
     ]
    }
   ],
   "source": [
    "x = torch.randn(1, 2, 3, 3)\n",
    "describe(x)\n",
    "\n",
    "conv1 = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=2)\n",
    "describe(conv1.weight)\n",
    "describe(conv1(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([1, 1, 3, 3])\n",
      "Values: \n",
      "tensor([[[[-0.2682,  0.4390,  1.3682],\n",
      "          [ 0.3038,  0.8558, -0.5000],\n",
      "          [ 1.5619, -0.5929,  0.6817]]]])\n",
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([2, 1, 2, 2])\n",
      "Values: \n",
      "Parameter containing:\n",
      "tensor([[[[-0.0029, -0.3377],\n",
      "          [-0.3707,  0.3836]]],\n",
      "\n",
      "\n",
      "        [[[ 0.2779, -0.3865],\n",
      "          [-0.1691,  0.4410]]]], requires_grad=True)\n",
      "Type: torch.FloatTensor\n",
      "Shape/size: torch.Size([1, 2, 2, 2])\n",
      "Values: \n",
      "tensor([[[[ 0.4943, -0.5463],\n",
      "          [-0.6703,  1.0738]],\n",
      "\n",
      "         [[-0.1458, -0.9997],\n",
      "          [-0.9996,  0.6042]]]], grad_fn=<MkldnnConvolutionBackward>)\n"
     ]
    }
   ],
   "source": [
    "x = torch.randn(1, 1, 3, 3)\n",
    "describe(x)\n",
    "\n",
    "conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=2)\n",
    "describe(conv1.weight)\n",
    "describe(conv1(x))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlpbook",
   "language": "python",
   "name": "nlpbook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
